import numpy as np
import pandas as pd
import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# How many of the highest-weighted words are printed for each topic.
GET_WORDS_NUM: int = 10
# Path of the preprocessed corpus file that the script loads before
# vectorizing (the disabled preprocessing step writes to the same path).
PREPROCESS_PATH: str = './news_preprocessing.txt'

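""" Uncomment the code below to run the preprocessing step; otherwise, place the preprocessed file as described in the report to reproduce the results.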
DATA_PATH = './news.txt'  
with open(DATA_PATH, encoding='utf-8') as f:
    corpus = f.readlines()
def preprocessing(text):
    # :param text:
    #     A single line of the corpus to preprocess.
    # :return:
    #     The preprocessed line.
    text = text.lower()
    puncs = string.punctuation + '‘“”’—'
    numbers = '1234567890'
    for i in puncs + numbers:
        text = text.replace(i, ' ')
    text = re.sub(r'\d +', '', text)

    wordList = nltk.word_tokenize(text)
    filtered = [w for w in wordList if w not in stopwords.words('english')]
    # Keep only nouns or specific POS tags (optional).
    # refiltered = nltk.pos_tag(filtered)
    # filtered = [w for w, pos in refiltered if pos.startswith('NN')]
    # Stemming (optional).
    # ps = PorterStemmer()
    # filtered = [ps.stem(w) for w in filtered]

    return " ".join(filtered)

from tqdm import tqdm
for i, _ in tqdm(enumerate(corpus)):
    corpus[i] = preprocessing(corpus[i])

# Preprocessing takes roughly one hour.
with open(PREPROCESS_PATH, 'w', encoding='utf-8') as f:
    for i in corpus:
        f.write('%s\n' % i)
"""

# Load the preprocessed corpus: every line of the file becomes one list entry
# (trailing newlines are kept, exactly as readlines() would return them).
with open(PREPROCESS_PATH, encoding='utf-8') as corpus_file:
    corpus = list(corpus_file)
print('数据读取完成!')

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words settings:
#   max_df=0.95        -> ignore terms found in more than 95% of the documents
#   min_df=2           -> ignore terms found in fewer than 2 documents
#                         (a document-frequency threshold, not a raw count)
#   max_features=10000 -> of the remaining terms, keep the 10000 most frequent
#   stop_words         -> drop scikit-learn's built-in English stop words
vectorizer_options = {
    'max_df': 0.95,
    'min_df': 2,
    'max_features': 10000,
    'stop_words': 'english',
}
count_vectorizer = CountVectorizer(**vectorizer_options)
# Learn the vocabulary and build the document-term count matrix in one pass.
word_mat = count_vectorizer.fit_transform(corpus)

from sklearn.decomposition import LatentDirichletAllocation

# Vocabulary lookup (column index -> term). It depends only on the fitted
# vectorizer, so it is resolved once here instead of once per topic count.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that predate it.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# Fit one LDA model per candidate topic count and print, for every topic,
# its GET_WORDS_NUM highest-weighted words.
for n_topic in [5, 10, 20]:
    lda = LatentDirichletAllocation(n_components=n_topic,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)  # fixed seed for reproducible topics
    lda.fit(word_mat)

    # The code here is improved from https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
    print(f'话题个数为 {n_topic} 时:')
    for topic_idx, topic in enumerate(lda.components_):
        # argsort() is ascending, so the reversed slice yields the indices of
        # the GET_WORDS_NUM largest weights, highest first.
        top_word_ids = topic.argsort()[:-GET_WORDS_NUM - 1:-1]
        print(f'    Topic #{topic_idx}: ', end='')
        print(', '.join([feature_names[i] for i in top_word_ids]))
    print()